{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from tqdm import tqdm_notebook as tqdm\n",
    "from functools import partial\n",
    "from sklearn.externals import joblib\n",
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "sys.path.append('../')\n",
    "from src.utils import parallel_apply\n",
    "from src.feature_extraction import add_features_in_group\n",
    "\n",
    "sys.path.append('../')\n",
    "from src.utils import parallel_apply\n",
    "from src.feature_extraction import add_features_in_group\n",
    "DIR = 'PATH/TO/YOUR/DATA'\n",
    "\n",
    "description = pd.read_csv(os.path.join(DIR,'data/HomeCredit_columns_description.csv'),encoding = 'latin1')\n",
    "application = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/application_train.csv'))\n",
    "pos_cash_balance = pd.read_csv(os.path.join(DIR, 'files/unzipped_data/POS_CASH_balance.csv'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_cash_balance.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering\n",
    "## Solution 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Aggregations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "POS_CASH_BALANCE_AGGREGATION_RECIPIES = []\n",
    "for agg in ['mean', 'min', 'max', 'sum', 'var']:\n",
    "    for select in ['MONTHS_BALANCE',\n",
    "                   'SK_DPD',\n",
    "                   'SK_DPD_DEF'\n",
    "                   ]:\n",
    "        POS_CASH_BALANCE_AGGREGATION_RECIPIES.append((select, agg))\n",
    "POS_CASH_BALANCE_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], POS_CASH_BALANCE_AGGREGATION_RECIPIES)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "groupby_aggregate_names = []\n",
    "for groupby_cols, specs in tqdm(POS_CASH_BALANCE_AGGREGATION_RECIPIES):\n",
    "    group_object = pos_cash_balance.groupby(groupby_cols)\n",
    "    for select, agg in tqdm(specs):\n",
    "        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)\n",
    "        application = application.merge(group_object[select]\n",
    "                              .agg(agg)\n",
    "                              .reset_index()\n",
    "                              .rename(index=str,\n",
    "                                      columns={select: groupby_aggregate_name})\n",
    "                              [groupby_cols + [groupby_aggregate_name]],\n",
    "                              on=groupby_cols,\n",
    "                              how='left')\n",
    "        groupby_aggregate_names.append(groupby_aggregate_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg = application[groupby_aggregate_names + ['TARGET']]\n",
    "application_agg_corr = abs(application_agg.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "application_agg_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution 4\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR': pos_cash_balance['SK_ID_CURR'].unique()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "group_object = pos_cash_balance.groupby('SK_ID_CURR')['CNT_INSTALMENT_FUTURE'].last().reset_index()\n",
    "group_object.rename(index=str,\n",
    "                    columns={'CNT_INSTALMENT_FUTURE': 'pos_cash_remaining_installments'},\n",
    "                    inplace=True)\n",
    "\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')\n",
    "features.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_cash_balance['is_contract_status_completed'] = pos_cash_balance['NAME_CONTRACT_STATUS'] == 'Completed'\n",
    "group_object = pos_cash_balance.groupby(['SK_ID_CURR'])['is_contract_status_completed'].sum().reset_index()\n",
    "group_object.rename(index=str,\n",
    "                    columns={'is_contract_status_completed': 'pos_cash_completed_contracts'},\n",
    "                    inplace=True)\n",
    "features = features.merge(group_object, on=['SK_ID_CURR'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = application.merge(features, left_on=['SK_ID_CURR'], right_on=['SK_ID_CURR'],\n",
    "                                how='left',\n",
    "                                validate='one_to_one')\n",
    "X = X[features.columns.tolist()+['TARGET']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engineered_numerical_columns = list(features.columns)\n",
    "engineered_numerical_columns.remove('SK_ID_CURR')\n",
    "credit_eng = X[engineered_numerical_columns + ['TARGET']]\n",
    "credit_eng_corr = abs(credit_eng.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "credit_eng_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(credit_eng_corr, \n",
    "            xticklabels=credit_eng_corr.columns,\n",
    "            yticklabels=credit_eng_corr.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Solution 5\n",
    "\n",
    "### Hand crafted features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_cash_balance['pos_cash_paid_late'] = (pos_cash_balance['SK_DPD'] > 0).astype(int)\n",
    "pos_cash_balance['pos_cash_paid_late_with_tolerance'] = (pos_cash_balance['SK_DPD_DEF'] > 0).astype(int)\n",
    "groupby = pos_cash_balance.groupby(['SK_ID_CURR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_installment_features(gr, periods):\n",
    "    gr_ = gr.copy()\n",
    "\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        if period > 10e10:\n",
    "            period_name = 'all_installment_'\n",
    "            gr_period = gr_.copy()\n",
    "        else:\n",
    "            period_name = 'last_{}_'.format(period)\n",
    "            gr_period = gr_[gr_['MONTHS_BALANCE'] >= (-1) * period]\n",
    "\n",
    "        features = add_features_in_group(features, gr_period, 'pos_cash_paid_late',\n",
    "                                             ['count', 'mean'],\n",
    "                                             period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'pos_cash_paid_late_with_tolerance',\n",
    "                                             ['count', 'mean'],\n",
    "                                             period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'SK_DPD',\n",
    "                                             ['sum', 'mean', 'max', 'min', 'median'],\n",
    "                                             period_name)\n",
    "        features = add_features_in_group(features, gr_period, 'SK_DPD_DEF',\n",
    "                                             ['sum', 'mean', 'max', 'min','median'],\n",
    "                                             period_name)\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = pd.DataFrame({'SK_ID_CURR': pos_cash_balance['SK_ID_CURR'].unique()})\n",
    "func = partial(last_k_installment_features, periods=[6, 12, 24, 60, 10e16])\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Last loan features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_loan_features(gr):\n",
    "    gr_ = gr.copy()\n",
    "    gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)\n",
    "    last_installment_id = gr_[gr_['MONTHS_BALANCE'] == gr_['MONTHS_BALANCE'].max()]['SK_ID_PREV']\n",
    "    gr_ = gr_[gr_['SK_ID_PREV'].isin(last_installment_id)]\n",
    "\n",
    "    features={}\n",
    "    features = add_features_in_group(features, gr_, 'pos_cash_paid_late',\n",
    "                                         ['count', 'sum', 'mean'],\n",
    "                                         'last_loan_')\n",
    "    features = add_features_in_group(features, gr_, 'pos_cash_paid_late_with_tolerance',\n",
    "                                         ['sum', 'mean'],\n",
    "                                         'last_loan_')\n",
    "    features = add_features_in_group(features, gr_, 'SK_DPD',\n",
    "                                         ['sum', 'mean', 'max', 'min', 'std'],\n",
    "                                         'last_loan_')\n",
    "    features = add_features_in_group(features, gr_, 'SK_DPD_DEF',\n",
    "                                         ['sum', 'mean', 'max', 'min', 'std'],\n",
    "                                         'last_loan_')\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = parallel_apply(groupby, last_loan_features, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Trend features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trend_in_last_k_installment_features(gr, periods):\n",
    "    gr_ = gr.copy()\n",
    "    gr_.sort_values(['MONTHS_BALANCE'], ascending=False, inplace=True)\n",
    "\n",
    "    features = {}\n",
    "    for period in periods:\n",
    "        gr_period = gr_[gr_['MONTHS_BALANCE'] >= (-1) * period]\n",
    "\n",
    "        features = add_trend_feature(features, gr_period,\n",
    "                                         'SK_DPD', '{}_period_trend_'.format(period)\n",
    "                                         )\n",
    "        features = add_trend_feature(features, gr_period,\n",
    "                                         'SK_DPD_DEF', '{}_period_trend_'.format(period)\n",
    "                                         )\n",
    "    return features\n",
    "\n",
    "def add_trend_feature(features, gr, feature_name, prefix):\n",
    "    y = gr[feature_name].values\n",
    "    try:\n",
    "        x = np.arange(0, len(y)).reshape(-1, 1)\n",
    "        lr = LinearRegression()\n",
    "        lr.fit(x, y)\n",
    "        trend = lr.coef_[0]\n",
    "    except:\n",
    "        trend = np.nan\n",
    "    features['{}{}'.format(prefix, feature_name)] = trend\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "func = partial(trend_in_last_k_installment_features, periods=[6, 12, 24, 60])\n",
    "g = parallel_apply(groupby, func, index_name='SK_ID_CURR', num_workers=10, chunk_size=10000).reset_index()\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_k_instalment_fractions(old_features, fraction_periods):\n",
    "    features = old_features[['SK_ID_CURR']].copy()\n",
    "    \n",
    "    for short_period, long_period in fraction_periods:\n",
    "        short_feature_names = _get_feature_names(old_features, short_period)\n",
    "        long_feature_names = _get_feature_names(old_features, long_period)\n",
    "        \n",
    "        for short_feature, long_feature in zip(short_feature_names, long_feature_names):\n",
    "            old_name_chunk = '_{}_'.format(short_period)\n",
    "            new_name_chunk ='_{}by{}_fraction_'.format(short_period, long_period)\n",
    "            fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)\n",
    "            features[fraction_feature_name] = old_features[short_feature]/old_features[long_feature]\n",
    "    return features.fillna(0.0)\n",
    "\n",
    "def _get_feature_names(features, period):\n",
    "    return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g = last_k_instalment_fractions(features, fraction_periods=[(6, 12), (6, 24), (12,24), (12, 60)])\n",
    "features = features.merge(g, on='SK_ID_CURR', how='left')\n",
    "\n",
    "display(features.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "X = X.merge(features, on='SK_ID_CURR',how='left')\n",
    "X_corr = abs(X.corr())\n",
    "X_corr.sort_values('TARGET', ascending=False)['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regex = '12by60_fraction'\n",
    "X_corr_truncated = X_corr.sort_values('TARGET', ascending=False).filter(regex=regex, axis=0)\n",
    "X_corr_truncated['TARGET']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "colnames = X_corr_truncated.index.tolist() + ['TARGET']\n",
    "sns.heatmap(X_corr_truncated[colnames], \n",
    "            xticklabels=colnames,\n",
    "            yticklabels=colnames)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
